KIDNEY RENAL CLEAR-CELL-CARCINOMA EXPRESSION ANALYSE

Andreu Bofill Pumarola (andreu.bofill01@estudiant.upf.edu)
Sergio Castillo Lara (sergio.castillo01@estudiant.upf.edu)
Adrià Pérez Culubret (adria.perez06@estudiant.upf.edu)

LOADING DATA

library(SummarizedExperiment)
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(edgeR)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
## 
##     collapse
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## The following object is masked from 'package:base':
## 
##     anyNA
library(ggplot2)
library(geneplotter)
## Loading required package: lattice
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: XML
library(org.Hs.eg.db)
## Loading required package: DBI
## 
library(GOstats)
## Loading required package: Category
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:IRanges':
## 
##     expand
## Loading required package: GO.db
## 
## Loading required package: graph
## 
## Attaching package: 'graph'
## The following object is masked from 'package:XML':
## 
##     addNode
## 
## Attaching package: 'GOstats'
## The following object is masked from 'package:AnnotationDbi':
## 
##     makeGOGraph
library(biomaRt)
library(KEGG.db)
## 
## KEGG.db contains mappings based on older data because the original
##   resource was removed from the the public domain before the most
##   recent update was produced. This package should now be
##   considered deprecated and future versions of Bioconductor may
##   not have it available.  Users who want more current data are
##   encouraged to look at the KEGGREST or reactome.db packages
#library(e1071)
library(GSEABase)
library(GSVAdata)
## Loading required package: hgu95a.db
## 

We load the raw RNA-seq data counts set of Kidney renal clear-cell-carcinoma.

se <- readRDS(file.path("~/GitHub/KIDNEY/seKIRC.rds"))

se
## class: RangedSummarizedExperiment 
## dim: 20115 614 
## metadata(5): experimentData annotation cancerTypeCode
##   cancerTypeDescription objectCreationDate
## assays(1): counts
## rownames(20115): 1 2 ... 102724473 103091865
## rowRanges metadata column names(3): symbol txlen txgc
## colnames(614): TCGA.3Z.A93Z.01A.11R.A37O.07
##   TCGA.6D.AA2E.01A.11R.A37O.07 ... TCGA.CZ.5988.11A.01R.1672.07
##   TCGA.CZ.5989.11A.01R.1672.07
## colData names(549): type bcr_patient_uuid ...
##   lymph_nodes_aortic_pos_by_ihc lymph_nodes_aortic_pos_total

Exploring phenotypic data

We take a look to the column data, which include the phenotipic information of all samples of the sumarized experiment object.

dim(colData(se))
## [1] 614 549
colData(se)[1:5, 1:5]
## DataFrame with 5 rows and 5 columns
##                                  type                     bcr_patient_uuid
##                              <factor>                             <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07    tumor 2B1DEA0A-6D55-4FDD-9C1C-0D9FBE03BD78
## TCGA.6D.AA2E.01A.11R.A37O.07    tumor D3B47E53-6F40-4FC8-B5A4-CBE548A770A9
## TCGA.A3.3306.01A.01R.0864.07    tumor 9fb55e0b-43d8-40a3-8ef2-d198e6290551
## TCGA.A3.3307.01A.01R.0864.07    tumor 7ac1d6c6-9ade-49af-8794-10b5b96b2b05
## TCGA.A3.3308.01A.02R.1325.07    tumor 3cbca837-f5a7-4a87-8f02-c59eac232d5a
##                              bcr_patient_barcode form_completion_date
##                                         <factor>             <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07        TCGA-3Z-A93Z           2014-11-11
## TCGA.6D.AA2E.01A.11R.A37O.07        TCGA-6D-AA2E            2014-3-17
## TCGA.A3.3306.01A.01R.0864.07        TCGA-A3-3306            2010-8-23
## TCGA.A3.3307.01A.01R.0864.07        TCGA-A3-3307            2010-4-13
## TCGA.A3.3308.01A.02R.1325.07        TCGA-A3-3308            2010-4-12
##                              prospective_collection
##                                            <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07                    YES
## TCGA.6D.AA2E.01A.11R.A37O.07                    YES
## TCGA.A3.3306.01A.01R.0864.07                     NO
## TCGA.A3.3307.01A.01R.0864.07                     NO
## TCGA.A3.3308.01A.02R.1325.07                     NO

We also need to observe to the metadata information content of these data.

mcols(colData(se), use.names=TRUE)
## DataFrame with 549 rows and 2 columns
##                                                          labelDescription
##                                                               <character>
## type                                           sample type (tumor/normal)
## bcr_patient_uuid                                         bcr patient uuid
## bcr_patient_barcode                                   bcr patient barcode
## form_completion_date                                 form completion date
## prospective_collection            tissue prospective collection indicator
## ...                                                                   ...
## lymph_nodes_pelvic_pos_total                               total pelv lnp
## lymph_nodes_aortic_examined_count                           total aor lnr
## lymph_nodes_aortic_pos_by_he                          aln pos light micro
## lymph_nodes_aortic_pos_by_ihc                                 aln pos ihc
## lymph_nodes_aortic_pos_total                                total aor-lnp
##                                         CDEID
##                                   <character>
## type                                       NA
## bcr_patient_uuid                           NA
## bcr_patient_barcode                   2673794
## form_completion_date                       NA
## prospective_collection                3088492
## ...                                       ...
## lymph_nodes_pelvic_pos_total          3151828
## lymph_nodes_aortic_examined_count     3104460
## lymph_nodes_aortic_pos_by_he          3151832
## lymph_nodes_aortic_pos_by_ihc         3151831
## lymph_nodes_aortic_pos_total          3151827

As we can see there are three different columns on the metadata result. The first column is the name of all the variables; the second is the labelDescription, which is the definition of each variable; and the last one corresponding to the Common Data Element Identifier (CDEID).

Exploring feature data

We look through the data rows, which are the feature elements, to see the genes information of our data set.

rowRanges(se)
## GRanges object with 20115 ranges and 3 metadata columns:
##             seqnames               ranges strand   |      symbol     txlen
##                <Rle>            <IRanges>  <Rle>   | <character> <integer>
##           1    chr19 [58345178, 58362751]      -   |        A1BG      3322
##           2    chr12 [ 9067664,  9116229]      -   |         A2M      4844
##           9     chr8 [18170477, 18223689]      +   |        NAT1      2280
##          10     chr8 [18391245, 18401218]      +   |        NAT2      1322
##          12    chr14 [94592058, 94624646]      +   |    SERPINA3      3067
##         ...      ...                  ...    ... ...         ...       ...
##   100996331    chr15 [20835372, 21877298]      -   |       POTEB      1706
##   101340251    chr17 [40027542, 40027645]      -   |    SNORD124       104
##   101340252     chr9 [33934296, 33934376]      -   |   SNORD121B        81
##   102724473     chrX [49303669, 49319844]      +   |      GAGE10       538
##   103091865    chr21 [39313935, 39314962]      +   |   BRWD1-IT2      1028
##                  txgc
##             <numeric>
##           1 0.5644190
##           2 0.4882329
##           9 0.3942982
##          10 0.3895613
##          12 0.5249429
##         ...       ...
##   100996331 0.4308324
##   101340251 0.4903846
##   101340252 0.4074074
##   102724473 0.5055762
##   103091865 0.5924125
##   -------
##   seqinfo: 455 sequences (1 circular) from hg38 genome

DGE object

We create an object to hold the dataset to be analysed in a better and more comprehensive way.

dge <- DGEList(counts=assays(se)$counts, genes=mcols(se))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored

Moreover, we calculate \(\log_2\) CPM values of expression and we save them in an assay element.

assays(se)$logCPM <- cpm(dge, log=TRUE, prior.count=0.5)

QUALITY AND NORMALIZATION

Library size filtering

At this point, we need to determine if the library sizes of tumor samples and normal samples are similar or not.

libsize <- data.frame(libsize = dge$sample$lib.size/1e6, type = se$type)

summary(libsize)
##     libsize            type    
##  Min.   :  3.049   normal: 72  
##  1st Qu.: 38.075   tumor :542  
##  Median : 46.715               
##  Mean   : 46.679               
##  3rd Qu.: 54.647               
##  Max.   :115.546
Figure S1: Density distribution of library size.

Figure S1: Density distribution of library size.

This past figure shows the density distribution on sequencing depth (milions of reads).

Figure S2: Histogram of library size distribution.

Figure S2: Histogram of library size distribution.

In this other figure, we can see now the proportion of samples in each sequencing depth (Milions of Reads).

As we can see, both figures show that the normal and tumor samples are similar in terms of sequencing depth, except for some samples that show extreme values.

Moreover, we can see how both normal and tumor types, follow a normal distribution through the sequencing depth.

At this point, we can filter out those samples with a low sequencing depth (<45 Milions of reads) because they are less reliable.

dge.filt <- dge[, dge$sample$lib.size/1e6 > 45 ]

final.samples <- rownames(dge.filt$samples)

se.filt <-se[,colnames(se) %in% final.samples]
se.normal <- se.filt[,se.filt$type == "normal"]
se.tumor  <- se.filt[,se.filt$type == "tumor"]


normal.code <- substr(colnames(se.normal), 9, 12)
tumor.code <- substr(colnames(se.tumor), 9, 12)

common.codes <- intersect(normal.code, tumor.code)
length(common.codes)
## [1] 38
se.paired <- se.filt[,substr(colnames(se.filt), 9, 12) %in% common.codes]
summary(se.paired$type)
## normal  tumor 
##     38     38
colData(se.paired)$samplecodes <- substr(colnames(se.paired), 9, 12)
dge.filt <- DGEList(counts=assays(se.paired)$counts, genes=mcols(se.paired))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored
Figure S3: Library sizes in increasing order.

Figure S3: Library sizes in increasing order.

Genes filtering: Distribution of expression levels

par(mfrow=c(1,2), mar=c(4,5,1,1))

Expression levels comparison between all samples

In the next part, we take a look on the distribution of expression values per sample in terms of logarithmic CPM units.

Figure S4: Multidensity plot of log2CPM.

Figure S4: Multidensity plot of log2CPM.

We cannot observe significant diferences between samples in terms of expression levels.

As we have more than 200 samples, we now display the normal and tumor distribution separately.

Expression levels comparison between normal and tumor

normalCPM <- se.paired[,se.paired$type == "normal"]
tumorCPM <- se.paired[,se.paired$type == "tumor"]
Figure S5: Multidensity plot of normal samples log2CPM.

Figure S5: Multidensity plot of normal samples log2CPM.

Figure S6: Multidensity plot of tumor samples log2CPM.

Figure S6: Multidensity plot of tumor samples log2CPM.

Distribution of expression levels among genes

Now, we want to filter out the genes with a low expression. To do that, we can plot the CPMs in logarithmic scale.

genemean <- data.frame(Means=aveLogCPM(assays(se.paired)$counts))
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.
Figure S7: Distribution of expression levels among genes.

Figure S7: Distribution of expression levels among genes.

We can see how there are two peaks of genes. The first one correspond to these genes with a very low expression and thus we decide to remove them. In other to do that, we used a minimum average of 1 LogCPM unit as a cutoff to filter out lowly expressed genes.

avgexp <- aveLogCPM(assays(se.paired)$counts)
mask <- avgexp > 1

These are the numbers of genes before filtering:

dim(se.paired) # Before filtering SE
## [1] 20115    76
dim(dge.filt) # Before filtering DGE
## [1] 20115    76

These are the numbers of samples genes after filtering:

se.paired.genes <- se.paired[mask, ]
dim(se.paired.genes) # After filtering SE
## [1] 12495    76
dge.filt <- dge.filt[mask, ]
dim(dge.filt) # After filtering DGE
## [1] 12495    76

At this point, we can calculate the normaliation factors on the filtered expression data set.

dgenorm <- calcNormFactors(dge.filt)
assays(se.paired.genes)$logCPMnorm <- cpm(dgenorm, log=TRUE, prior.count=0.5)

MA-plots

We look at the MA-plots of the normal samples to see if there are systematic biases in gene expression levels in any of the sampes. We expect the slope to be around zero.

Figure S8: MA-plots of normal samples.

Figure S8: MA-plots of normal samples.

After observing them, one normal sample has been identified with a biased gene expression, so we take it out of our dataset. This sample is the TGCA-CW-5591.

dim(se.paired.genes) # Before filtering SE
## [1] 12495    76
dim(dge.filt) # Before filtering DGE
## [1] 12495    76
se.paired.genes <- se.paired.genes[,!se.paired.genes$bcr_patient_barcode %in% "TCGA-CW-5591"]
dim(se.paired.genes) # After filtering SE
## [1] 12495    74
dge.filt <- dge.filt[, !substr(rownames(dge.filt$samples), 1, 12) %in% "TCGA.CW.5591"]
dim(dge.filt) # After filtering DGE
## [1] 12495    74
dgenorm <- dgenorm[, !substr(rownames(dgenorm$samples), 1, 12) %in% "TCGA.CW.5591"]
dim(dgenorm) # After filtering DGE
## [1] 12495    74

Now, we examine the tumor samples:

Figure S9: MA-plots of tumor samples.

Figure S9: MA-plots of tumor samples.

As we can see, we don’t see any tumor samples with major biases in gene expression

Batch identification

Now we’re going to analyze our data in order to search for batch effect that could interfere with the biological signal. First, we analyze some of the information contained in the barcode, such as tissue source site, center of sequenciation, plate and portion and analyte combinations. We use two approaches to try to identify batch effect, the Hierarchical Clustering and a Multidimensional Scaling plot.

tss <- substr(colnames(se.paired.genes), 6, 7)
table(tss)
## tss
## B0 B2 B8 CJ CW CZ 
## 14  2  4  4 14 36
center <- substr(colnames(se.paired.genes), 27, 28)
table(center)
## center
## 07 
## 74
plate <- substr(colnames(se.paired.genes), 22, 25)
table(plate)
## plate
## 1503 1541 1672 
##   38   18   18
portionanalyte <- substr(colnames(se.paired.genes), 18, 20)
table(portionanalyte)
## portionanalyte
## 01R 02R 11R 
##  58   4  12
samplevial <- substr(colnames(se.paired.genes), 14, 16)
table(samplevial)
## samplevial
## 01A 01B 11A 
##  36   1  37

GENDER batch effect

table(data.frame(TYPE=se.paired.genes$type, GENDER=se.paired.genes$gender))
##         GENDER
## TYPE     FEMALE MALE
##   normal     13   24
##   tumor      13   24
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.paired.genes$gender))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(se.paired.genes$gender))), fill=sort(unique(batch)))
Figure S10: Hierarchical clustering of samples.

Figure S10: Hierarchical clustering of samples.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Gender", sort(unique(batch)), levels(factor(se.paired.genes$gender))),
       fill=sort(unique(batch)), inset=0.05)
Figure S11: Multidimensional scaling plot of samples separated by gender.

Figure S11: Multidimensional scaling plot of samples separated by gender.

TSS batch effect

table(data.frame(TYPE=se.paired.genes$type, TSS=tss))
##         TSS
## TYPE     B0 B2 B8 CJ CW CZ
##   normal  7  1  2  2  7 18
##   tumor   7  1  2  2  7 18
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(tss))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(tss))), fill=sort(unique(batch)))
Figure S12: Hierarchical clustering of samples separated by TSS.

Figure S12: Hierarchical clustering of samples separated by TSS.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("TSS", sort(unique(batch)), levels(factor(tss))),
       fill=sort(unique(batch)), inset=0.05)
Figure S13: Multidimensional scaling plot of samples separated by TSS.

Figure S13: Multidimensional scaling plot of samples separated by TSS.

PLATE batch effect

table(data.frame(TYPE=se.paired.genes$type, PLATE=plate))
##         PLATE
## TYPE     1503 1541 1672
##   normal   19    9    9
##   tumor    19    9    9
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(plate))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("PLATE", sort(unique(batch)), levels(factor(plate))), fill=sort(unique(batch)))
Figure S14: Hierarchical clustering of the samples separated by PLATE.

Figure S14: Hierarchical clustering of the samples separated by PLATE.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("plate", sort(unique(batch)), levels(factor(plate))),
       fill=sort(unique(batch)), inset=0.05)
Figure S15: Multidimensional scaling plot of the samples separated by PLATE.

Figure S15: Multidimensional scaling plot of the samples separated by PLATE.

PORTION-ANALYTE batch effect

table(data.frame(TYPE=se.paired.genes$type, P.analyte = portionanalyte))
##         P.analyte
## TYPE     01R 02R 11R
##   normal  35   2   0
##   tumor   23   2  12
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(portionanalyte))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))), fill=sort(unique(batch)))
Figure S16: Hierarchical clustering of the samples separated by PortionAnalyte.

Figure S16: Hierarchical clustering of the samples separated by PortionAnalyte.

plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))),
       fill=sort(unique(batch)), inset=0.05)
Figure S17: Multidimensional scaling plot of samples separated by PortionAnalyte.

Figure S17: Multidimensional scaling plot of samples separated by PortionAnalyte.

The Hierarchical clustering and the Multidimensional Scaling plots show a clear differentiation between normal samples and tumor samples. All four plots from the different elements of the barcode have a similar clustering and distribution. There is no clear effect from the batch indicators in the clustering, and it seems to have a balanced design (more clearly in tumor than in normal), although it seems that 9 tumor samples cluster apart from the other samples in the hierarchical clustering, so we should consider taking apart this samples.

DIFFERENTIAL EXPRESSION ANALYSIS

In order to identify differentially expressed genes, we used a linear model with two factors: one being the cell type (tumor or normal) and the other being the individual/sample.

We chose a False Discovery Rate of 0.01 (1%). We corrected#### Over represented KEGGs

Mean variance relationship

design <- model.matrix(~type + samplecodes, data = colData(se.paired.genes))
v <- voom(dgenorm, design, plot=FALSE)
dim(se.paired.genes)
## [1] 12495    74
dim(dgenorm)
## [1] 12495    74
FDRcutoff <- 0.01
mod0 <- model.matrix(~samplecodes, colData(se.paired.genes))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
design.voom <- ""
design.voom <- cbind(design, sv$sv)


fit <- lmFit(v, design.voom)
fit <- eBayes(fit)
res <- decideTests(fit, p.value = FDRcutoff)
Figure S18: Volcano plot of DE analysis.

Figure S18: Volcano plot of DE analysis.

colnames(design)
toptable <- topTable(fit, coef = 2, n=Inf)
Figure S19: Pvalue distribution and qqplot.

Figure S19: Pvalue distribution and qqplot.

Over vs under

summary(toptable)
##     symbol              txlen            txgc            logFC          
##  Length:12495       Min.   :   43   Min.   :0.2133   Min.   :-12.46608  
##  Class :character   1st Qu.: 1738   1st Qu.:0.4223   1st Qu.: -0.40791  
##  Mode  :character   Median : 2970   Median :0.4858   Median :  0.07708  
##                     Mean   : 3592   Mean   :0.4954   Mean   :  0.02842  
##                     3rd Qu.: 4738   3rd Qu.:0.5654   3rd Qu.:  0.54939  
##                     Max.   :91667   Max.   :0.8636   Max.   : 10.11549  
##     AveExpr             t               P.Value         
##  Min.   :-5.464   Min.   :-34.1775   Min.   :0.0000000  
##  1st Qu.: 3.199   1st Qu.: -4.6644   1st Qu.:0.0000000  
##  Median : 4.901   Median :  0.9748   Median :0.0000023  
##  Mean   : 4.624   Mean   :  0.9532   Mean   :0.0777713  
##  3rd Qu.: 6.139   3rd Qu.:  6.6006   3rd Qu.:0.0134982  
##  Max.   :11.941   Max.   : 45.0974   Max.   :0.9982831  
##    adj.P.Val               B         
##  Min.   :0.0000000   Min.   :-8.491  
##  1st Qu.:0.0000000   1st Qu.:-5.036  
##  Median :0.0000045   Median : 3.431  
##  Mean   :0.0832298   Mean   : 6.350  
##  3rd Qu.:0.0179971   3rd Qu.:15.017  
##  Max.   :0.9982831   Max.   :60.630
Figure S20: Plot of logFC distribution.

Figure S20: Plot of logFC distribution.

under_exp <- toptable[toptable$logFC <= -5 ,]
over_exp  <- toptable[toptable$logFC >= 5,] 

under_exp.all <- toptable[toptable$logFC <= 0 ,]
over_exp.all  <- toptable[toptable$logFC >= 0,] 

test_overexp <- toptable[toptable$logFC >= 3,]  

dim(under_exp)
## [1] 153   9
dim(over_exp)
## [1] 46  9
head(over_exp[order(over_exp$logFC, decreasing = TRUE),])
##        symbol txlen      txgc     logFC    AveExpr        t      P.Value
## 2184      FAH  3427 0.5170703 10.115493  0.8164831 16.72456 1.562600e-17
## 81617  CAB39L  3692 0.4320152  9.104303  3.4604998 22.65289 1.746516e-21
## 389320  TEX43   464 0.4030172  8.527648 -1.9867210 17.35212 5.303748e-18
## 973     CD79A  1258 0.6073132  8.336017  0.9896687 26.65775 1.142480e-23
## 5965    RECQL  3702 0.3725014  7.984984  2.0659290 17.20022 6.869086e-18
## 401387  LRRD1  2830 0.3395760  7.622375 -0.7109079 18.74206 5.402535e-19
##           adj.P.Val        B
## 2184   2.532386e-16 29.46327
## 81617  1.536811e-19 38.72011
## 389320 1.022690e-16 29.64651
## 973    2.833658e-21 42.55773
## 5965   1.275323e-16 30.74923
## 401387 1.480366e-17 32.07925
head(under_exp[order(under_exp$logFC, decreasing = FALSE),])
##         symbol txlen      txgc     logFC  AveExpr         t      P.Value
## 400713  ZNF880  2230 0.3995516 -12.46608 6.106883 -22.48152 2.203720e-21
## 361       AQP4  5274 0.3795980 -12.09207 5.307332 -22.84434 1.349424e-21
## 125875  CLDND2   974 0.6190965 -11.38629 1.464706 -24.75417 1.138355e-22
## 2592      GALT  2566 0.5420889 -10.99625 2.085865 -24.36734 1.852009e-22
## 8649   LAMTOR3  4226 0.3774255 -10.96988 3.871072 -22.36824 2.572036e-21
## 391196   OR2M7   939 0.4675186 -10.65841 1.105295 -27.39939 4.857318e-24
##           adj.P.Val        B
## 400713 1.835699e-19 38.66244
## 361    1.286424e-19 38.94593
## 125875 1.725189e-20 39.02897
## 2592   2.461793e-20 39.31175
## 8649   2.034024e-19 38.14609
## 391196 1.619172e-21 41.34597
toptable[toptable$symbol == "PBRM1",]
##       symbol txlen      txgc    logFC  AveExpr       t      P.Value
## 55193  PBRM1  7523 0.4035624 4.146581 2.805781 18.2486 1.195662e-18
##          adj.P.Val        B
## 55193 2.873037e-17 32.50217
toptable[toptable$symbol == "SETD2",]
##       symbol txlen      txgc    logFC  AveExpr        t      P.Value
## 29072  SETD2  8172 0.4274351 3.342995 1.995913 19.56703 1.487443e-19
##       adj.P.Val        B
## 29072  5.28e-18 34.61302
toptable[toptable$symbol == "AKT",]
## [1] symbol    txlen     txgc      logFC     AveExpr   t         P.Value  
## [8] adj.P.Val B        
## <0 rows> (or 0-length row.names)
toptable[toptable$symbol == "VHL1",]
## [1] symbol    txlen     txgc      logFC     AveExpr   t         P.Value  
## [8] adj.P.Val B        
## <0 rows> (or 0-length row.names)
# PBRM1 i SETD2 is over y es BIEN.

Gene set expression analysis

data(c2BroadSets)

# Filtering to only KEGG, Reactome and BioCarta
gsc <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)),
               grep("^REACTOME", names(c2BroadSets)), grep("^BIOCARTA", names(c2BroadSets)))]

gsc <- mapIdentifiers(gsc, AnnoOrEntrezIdentifier(metadata(se.paired.genes)$annotation))
gsmatrix <- incidence(gsc)
dim(gsmatrix)
## [1]  833 6744
# Discard genes not in our data and genes not in the gene sets

gsmatrix <- gsmatrix[, colnames(gsmatrix) %in% rownames(se.paired.genes)]
dim(gsmatrix)
## [1]  833 4038
se.paired.gsea <- se.paired.genes[colnames(gsmatrix), ]
dim(se.paired.genes)
## [1] 12495    74
dim(se.paired.gsea)
## [1] 4038   74
dgenorm.gsea <- dgenorm[colnames(gsmatrix), ]
dim(dgenorm)
## [1] 12495    74
dim(dgenorm.gsea)
## [1] 4038   74
## Filtering gene sets with a minimum size of 5
gsmatrix <- gsmatrix[rowSums(gsmatrix) >= 5, ]
dim(gsmatrix)
## [1]  807 4038
design <- model.matrix(~type + samplecodes, data = colData(se.paired.gsea))

v <- voom(dgenorm, design, plot=FALSE)
FDRcutoff <- 0.01
mod0 <- model.matrix(~samplecodes, colData(se.paired.gsea))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is:  7 
## Iteration (out of 5 ):1  2  3  4  5
design.voom <- ""
design.voom <- cbind(design, sv$sv)

fit <- lmFit(v, design.voom)
fit <- eBayes(fit)
toptable.gsea <- topTable(fit, coef = 2, n=Inf)


tGSgenes <- toptable.gsea[match(colnames(gsmatrix), rownames(toptable.gsea)), "t"]
length(tGSgenes)
## [1] 4038
head(tGSgenes)
## [1]  -4.768989  -4.177214   1.941461  12.904205 -11.136535   2.767150
zS <- sqrt(rowSums(gsmatrix)) * (as.vector(gsmatrix %*% tGSgenes)/rowSums(gsmatrix))
length(zS)
## [1] 807
head(zS)
##               KEGG_GLYCOLYSIS_GLUCONEOGENESIS 
##                                    -15.558933 
##                  KEGG_CITRATE_CYCLE_TCA_CYCLE 
##                                     -1.217683 
##                KEGG_PENTOSE_PHOSPHATE_PATHWAY 
##                                     -1.592216 
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 
##                                     -4.944474 
##          KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM 
##                                    -13.367401 
##                     KEGG_GALACTOSE_METABOLISM 
##                                    -14.827441
Figure S21: GSEA qqplot of normal samples.

Figure S21: GSEA qqplot of normal samples.

rnkGS <- sort(abs(zS), decreasing = TRUE)
head(rnkGS,20)
##                      KEGG_OXIDATIVE_PHOSPHORYLATION 
##                                            51.25554 
##                       KEGG_PRIMARY_IMMUNODEFICIENCY 
##                                            41.91329 
##        REACTOME_DOWNSTREAM_EVENTS_IN_GPCR_SIGNALING 
##                                            40.84898 
##       REACTOME_SLC_MEDIATED_TRANSMEMBRANE_TRANSPORT 
##                                            39.72385 
##         KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 
##                                            35.65341 
##                     KEGG_JAK_STAT_SIGNALING_PATHWAY 
##                                            34.85214 
##                          KEGG_FATTY_ACID_METABOLISM 
##                                            33.10837 
##                             KEGG_PATHWAYS_IN_CANCER 
##                                            32.33795 
##             KEGG_VASCULAR_SMOOTH_MUSCLE_CONTRACTION 
##                                            32.31561 
## REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLECULES 
##                                            32.18810 
##   REACTOME_GLUCOSE_AND_OTHER_SUGAR_SLC_TRANSPORTERS 
##                                            31.77223 
##                          BIOCARTA_CELLCYCLE_PATHWAY 
##                                            31.56679 
##        REACTOME_CHEMOKINE_RECEPTORS_BIND_CHEMOKINES 
##                                            31.51400 
##                REACTOME_G_ALPHA_Q_SIGNALLING_EVENTS 
##                                            31.31659 
##                        BIOCARTA_TCAPOPTOSIS_PATHWAY 
##                                            31.13932 
##                         KEGG_SMALL_CELL_LUNG_CANCER 
##                                            30.89871 
##                     KEGG_HEMATOPOIETIC_CELL_LINEAGE 
##                                            30.09452 
##                            BIOCARTA_THELPER_PATHWAY 
##                                            29.81463 
##                              REACTOME_PD1_SIGNALING 
##                                            29.49595 
##                         BIOCARTA_SALMONELLA_PATHWAY 
##                                            29.44496
## plot stuff
plotGS <- function(se, gs, pheno, ...) {
    l <- levels(colData(se)[, pheno])
    idxSamples1 <- colData(se)[, pheno] == l[1]
    idxSamples2 <- colData(se)[, pheno] == l[2]
    exps1 <- rowMeans(assays(se)$logCPM[gs, idxSamples1])
    exps2 <- rowMeans(assays(se)$logCPM[gs, idxSamples2])
    rng <- range(c(exps1, exps2))
    plot(exps1, exps2, pch = 21, col = "black", bg = "black", xlim = rng, ylim = rng, 
        xlab = l[1], ylab = l[2], ...)
    abline(a = 0, b = 1, lwd = 2, col = "red")
}
Figure S22: Scatter plots of gene sets sorted by z-score.

Figure S22: Scatter plots of gene sets sorted by z-score.

Functional enrichment

Over represented GOs

DEgenes <- rownames(toptable)[toptable$adj.P.Val < 0.01]
length(DEgenes)
## [1] 9108
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", ontology="BP",
              pvalueCutoff=0.01, testDirection="over")

conditional(params) <- TRUE
hgOver <- hyperGTest(params)
GOresults <- summary(hgOver)
GOresults <- GOresults[GOresults$Size >= 5 & GOresults$Count >= 5,]

GOresults <- GOresults[order(GOresults$OddsRatio, decreasing = TRUE), ]
GOresults <- GOresults[order(GOresults$Pvalue, decreasing = FALSE), ]

htmlReport(hgOver, file = "gotests.html")

GOs (Overexpressed DE)

DEgenes <- rownames(over_exp.all)[over_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 5094
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", ontology="BP",
              pvalueCutoff=0.01, testDirection="over")

conditional(params) <- TRUE
hgOver <- hyperGTest(params)
GOresultsover <- summary(hgOver)
GOresultsover <- GOresultsover[GOresultsover$Size >= 5 & GOresultsover$Count >= 5,]

GOresultsover <- GOresultsover[order(GOresultsover$OddsRatio, decreasing = TRUE), ]
GOresultsover <- GOresultsover[order(GOresultsover$Pvalue, decreasing = FALSE), ]

head(GOresultsover)
##       GOBPID       Pvalue OddsRatio  ExpCount Count Size
## 1 GO:0051301 1.768797e-05  1.533603 165.24901   206  402
## 2 GO:0051251 3.038639e-05  2.063637  53.84980    77  131
## 3 GO:0007156 3.677431e-05  2.469144  34.52964    53   84
## 4 GO:0008283 8.589732e-05  1.275001 459.16206   518 1117
## 5 GO:0032946 1.057712e-04  2.633582  26.71937    42   65
## 6 GO:0050867 1.128832e-04  1.883097  59.60474    82  145
##                                                              Term
## 1                                                   cell division
## 2                    positive regulation of lymphocyte activation
## 3 homophilic cell adhesion via plasma membrane adhesion molecules
## 4                                              cell proliferation
## 5           positive regulation of mononuclear cell proliferation
## 6                          positive regulation of cell activation
htmlReport(hgOver, file = "gotestsover.html")

GOs (Underexpressed DE)

DEgenes <- rownames(under_exp.all)[under_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 4014
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", ontology="BP",
              pvalueCutoff=0.01, testDirection="over")

conditional(params) <- TRUE
hgUnder <- hyperGTest(params)
GOresultsunder <- summary(hgUnder)
GOresultsunder <- GOresultsunder[GOresultsunder$Size >= 5 & GOresultsunder$Count >= 5,]

GOresultsunder <- GOresultsunder[order(GOresultsunder$OddsRatio, decreasing = TRUE), ]
GOresultsunder <- GOresultsunder[order(GOresultsunder$Pvalue, decreasing = FALSE), ]

head(GOresultsunder)
##       GOBPID       Pvalue OddsRatio  ExpCount Count Size
## 1 GO:0007210 9.939131e-06 25.749672  4.143125    12   13
## 2 GO:0009145 9.815811e-05  4.534902  8.923653    19   28
## 3 GO:0009201 9.815811e-05  4.534902  8.923653    19   28
## 4 GO:0022904 1.096097e-04  2.731924 18.803412    33   59
## 5 GO:0006754 2.123284e-04  4.294813  8.604951    18   27
## 6 GO:0015991 2.123284e-04  4.294813  8.604951    18   27
##                                                  Term
## 1                serotonin receptor signaling pathway
## 2 purine nucleoside triphosphate biosynthetic process
## 3    ribonucleoside triphosphate biosynthetic process
## 4                respiratory electron transport chain
## 5                            ATP biosynthetic process
## 6             ATP hydrolysis coupled proton transport
htmlReport(hgUnder, file = "gotestsunder.html")

Over represented KEGGs

DEgenes <- rownames(toptable)[toptable$adj.P.Val < 0.01]
length(DEgenes)
## [1] 9108
geneUniverse <- rownames(se.paired.genes)
params <- new("KEGGHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
              annotation="org.Hs.eg.db", pvalueCutoff=0.01, testDirection="over")

hgOver <- hyperGTest(params)

hgOver
## Gene to KEGG  test for over-representation 
## 228 KEGG ids tested (6 have p < 0.01)
## Selected gene set size: 2577 
##     Gene universe size: 3494 
##     Annotation package: org.Hs.eg
KEGGresults <- summary(hgOver)

# Filtering by counts and size
KEGGresults <- KEGGresults[KEGGresults$Size >= 5 & KEGGresults$Count >= 5, ]


KEGGresults <- KEGGresults[order(KEGGresults$OddsRatio, decreasing = TRUE), ]
KEGGresults <- KEGGresults[order(KEGGresults$Pvalue, decreasing = FALSE), ]

head(KEGGresults)
##   KEGGID      Pvalue OddsRatio ExpCount Count Size
## 1  05340 0.001637912       Inf 15.48855    21   21
## 2  04660 0.002350134  3.126865 42.77790    52   58
## 3  05160 0.005940497  2.167665 61.95421    72   84
## 4  00650 0.006277214  8.249021 17.70120    23   24
## 5  00410 0.007574224       Inf 11.80080    16   16
## 6  04144 0.008741059  1.776341 91.45621   103  124
##                                Term
## 1          Primary immunodeficiency
## 2 T cell receptor signaling pathway
## 3                       Hepatitis C
## 4              Butanoate metabolism
## 5           beta-Alanine metabolism
## 6                       Endocytosis

When performing the functional enrichment analysis, some fo the gene sets have an OddsRatio value of Infinite. Which is the biological meaning of this?

HIERARCHICAL CLUSTERING USING DE GENES

Then, we performed a Hierarchical clustering and we plotted a Multidimensional scale plot using only the 77 D.E. genes.

head(colnames(colData(se.paired.genes)))
## [1] "type"                     "bcr_patient_uuid"        
## [3] "bcr_patient_barcode"      "form_completion_date"    
## [5] "prospective_collection"   "retrospective_collection"
# Get a mask with DE genes to filter SE object
DE.genes <-  rownames(se.paired.genes) %in% rownames(over_exp) | 
             rownames(se.paired.genes) %in% rownames(under_exp)

# Filter SE object using DE.genes (only paired samples)
se.DE  <- se.paired.genes[DE.genes,]
dgenorm.DE <- dgenorm[DE.genes,]

# Filter SE object using DE.genes (all samples)
se.all.DE      <- se[DE.genes,]
dgenorm.all.DE <- dge[DE.genes,]
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.DE)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.DE$type))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.DE)
outcome <- as.character(se.DE$type)
names(outcome) <- colnames(se.DE)
sampleDendrogram <- dendrapply(sampleDendrogram,
                               function(x, batch, labels) {
                                 if (is.leaf(x)) {
                                   attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
                                   attr(x, "label") <- as.vector(labels[attr(x, "label")])
                                 }
                                 x
                               }, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TYPE", sort(unique(batch)), levels(factor(se.DE$type))), fill=sort(unique(batch)))
Figure S23: Hierarchical clustering of the samples used in the analysis using only the DE genes.

Figure S23: Hierarchical clustering of the samples used in the analysis using only the DE genes.

#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.all.DE)$logCPM, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.all.DE$type))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.all.DE)
outcome <- as.character(se.all.DE$type)
names(outcome) <- colnames(se.all.DE)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TYPE", sort(unique(batch)), levels(factor(se.all.DE$type))), fill=sort(unique(batch)))
Figure S24: Hierarchical clustering of all samples using only the DE genes.

Figure S24: Hierarchical clustering of all samples using only the DE genes.

SESSION INFORMATION

sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] xtable_1.8-2               GSVAdata_1.6.0            
##  [3] hgu95a.db_3.2.2            GSEABase_1.32.0           
##  [5] KEGG.db_3.2.2              biomaRt_2.26.1            
##  [7] GOstats_2.36.0             graph_1.48.0              
##  [9] Category_2.36.0            GO.db_3.2.2               
## [11] Matrix_1.2-6               org.Hs.eg.db_3.2.3        
## [13] RSQLite_1.0.0              DBI_0.4                   
## [15] geneplotter_1.48.0         annotate_1.48.0           
## [17] XML_3.98-1.1               AnnotationDbi_1.32.3      
## [19] lattice_0.20-33            ggplot2_2.1.0             
## [21] sva_3.18.0                 genefilter_1.52.1         
## [23] mgcv_1.8-12                nlme_3.1-128              
## [25] edgeR_3.12.1               limma_3.26.9              
## [27] SummarizedExperiment_1.0.2 Biobase_2.30.0            
## [29] GenomicRanges_1.22.4       GenomeInfoDb_1.6.3        
## [31] IRanges_2.4.8              S4Vectors_0.8.11          
## [33] BiocGenerics_0.16.1       
## 
## loaded via a namespace (and not attached):
##  [1] splines_3.3.0          colorspace_1.2-6       htmltools_0.3.5       
##  [4] survival_2.39-4        RBGL_1.46.0            RColorBrewer_1.1-2    
##  [7] plyr_1.8.3             stringr_1.0.0          zlibbioc_1.16.0       
## [10] munsell_0.4.3          gtable_0.2.0           evaluate_0.9          
## [13] labeling_0.3           knitr_1.12.3           Rcpp_0.12.4           
## [16] KernSmooth_2.23-15     scales_0.4.0           formatR_1.3           
## [19] XVector_0.10.0         digest_0.6.9           stringi_1.0-1         
## [22] grid_3.3.0             tools_3.3.0            bitops_1.0-6          
## [25] magrittr_1.5           RCurl_1.95-4.8         rmarkdown_0.9.6       
## [28] AnnotationForge_1.12.2